{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "c0a043d2-2fe3-40bd-bf61-b6e12a218a42",
   "metadata": {},
   "source": [
    "# Introduction"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "554f934e-b66f-4633-bc93-8a51edaa25b0",
   "metadata": {},
   "outputs": [],
   "source": [
    "!python -m pip install polars\n",
    "!python -m pip install matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ec44a544-7828-461e-b735-536f3fe6fd16",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (3_076_903, 5)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>pick_up</th><th>drop_off</th><th>passengers</th><th>distance</th><th>fare</th></tr><tr><td>str</td><td>str</td><td>i32</td><td>i32</td><td>i32</td></tr></thead><tbody><tr><td>&quot;Manhattan&quot;</td><td>&quot;Manhattan&quot;</td><td>1</td><td>3</td><td>24</td></tr><tr><td>&quot;Queens&quot;</td><td>&quot;Manhattan&quot;</td><td>1</td><td>19</td><td>75</td></tr><tr><td>&quot;Manhattan&quot;</td><td>&quot;Queens&quot;</td><td>1</td><td>1</td><td>16</td></tr><tr><td>&quot;Queens&quot;</td><td>&quot;Manhattan&quot;</td><td>0</td><td>9</td><td>60</td></tr><tr><td>&quot;Queens&quot;</td><td>&quot;Manhattan&quot;</td><td>1</td><td>17</td><td>90</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;Manhattan&quot;</td><td>&quot;Manhattan&quot;</td><td>null</td><td>5</td><td>27</td></tr><tr><td>&quot;Manhattan&quot;</td><td>&quot;Manhattan&quot;</td><td>null</td><td>4</td><td>26</td></tr><tr><td>&quot;Queens&quot;</td><td>&quot;Brooklyn&quot;</td><td>null</td><td>4</td><td>26</td></tr><tr><td>&quot;Manhattan&quot;</td><td>&quot;Manhattan&quot;</td><td>null</td><td>3</td><td>24</td></tr><tr><td>&quot;Manhattan&quot;</td><td>&quot;Manhattan&quot;</td><td>null</td><td>8</td><td>35</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (3_076_903, 5)\n",
       "┌───────────┬───────────┬────────────┬──────────┬──────┐\n",
       "│ pick_up   ┆ drop_off  ┆ passengers ┆ distance ┆ fare │\n",
       "│ ---       ┆ ---       ┆ ---        ┆ ---      ┆ ---  │\n",
       "│ str       ┆ str       ┆ i32        ┆ i32      ┆ i32  │\n",
       "╞═══════════╪═══════════╪════════════╪══════════╪══════╡\n",
       "│ Manhattan ┆ Manhattan ┆ 1          ┆ 3        ┆ 24   │\n",
       "│ Queens    ┆ Manhattan ┆ 1          ┆ 19       ┆ 75   │\n",
       "│ Manhattan ┆ Queens    ┆ 1          ┆ 1        ┆ 16   │\n",
       "│ Queens    ┆ Manhattan ┆ 0          ┆ 9        ┆ 60   │\n",
       "│ Queens    ┆ Manhattan ┆ 1          ┆ 17       ┆ 90   │\n",
       "│ …         ┆ …         ┆ …          ┆ …        ┆ …    │\n",
       "│ Manhattan ┆ Manhattan ┆ null       ┆ 5        ┆ 27   │\n",
       "│ Manhattan ┆ Manhattan ┆ null       ┆ 4        ┆ 26   │\n",
       "│ Queens    ┆ Brooklyn  ┆ null       ┆ 4        ┆ 26   │\n",
       "│ Manhattan ┆ Manhattan ┆ null       ┆ 3        ┆ 24   │\n",
       "│ Manhattan ┆ Manhattan ┆ null       ┆ 8        ┆ 35   │\n",
       "└───────────┴───────────┴────────────┴──────────┴──────┘"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "rides = pl.scan_parquet(\"rides.parquet\")\n",
    "\n",
    "rides.collect()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c4665acc-d207-4a4c-bdc6-10f436e32a99",
   "metadata": {},
   "source": [
    "# How to Create a Polars LazyFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "784b7616-643a-46bc-b449-0fbc5f390178",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (5, 4)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>language_id</th><th>language</th><th>creator</th><th>year</th></tr><tr><td>i64</td><td>str</td><td>str</td><td>i64</td></tr></thead><tbody><tr><td>0</td><td>&quot;Pascal&quot;</td><td>&quot;Niklaus Wirth&quot;</td><td>1970</td></tr><tr><td>1</td><td>&quot;C&quot;</td><td>&quot;Dennis Ritchie&quot;</td><td>1973</td></tr><tr><td>2</td><td>&quot;C++&quot;</td><td>&quot;Bjarne Stroustrup&quot;</td><td>1985</td></tr><tr><td>3</td><td>&quot;Python&quot;</td><td>&quot;Guido van Rossum&quot;</td><td>1991</td></tr><tr><td>4</td><td>&quot;Java&quot;</td><td>&quot;James Gosling&quot;</td><td>1995</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (5, 4)\n",
       "┌─────────────┬──────────┬───────────────────┬──────┐\n",
       "│ language_id ┆ language ┆ creator           ┆ year │\n",
       "│ ---         ┆ ---      ┆ ---               ┆ ---  │\n",
       "│ i64         ┆ str      ┆ str               ┆ i64  │\n",
       "╞═════════════╪══════════╪═══════════════════╪══════╡\n",
       "│ 0           ┆ Pascal   ┆ Niklaus Wirth     ┆ 1970 │\n",
       "│ 1           ┆ C        ┆ Dennis Ritchie    ┆ 1973 │\n",
       "│ 2           ┆ C++      ┆ Bjarne Stroustrup ┆ 1985 │\n",
       "│ 3           ┆ Python   ┆ Guido van Rossum  ┆ 1991 │\n",
       "│ 4           ┆ Java     ┆ James Gosling     ┆ 1995 │\n",
       "└─────────────┴──────────┴───────────────────┴──────┘"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "programming_languages = {\n",
    "    \"language_id\": range(5),\n",
    "    \"language\": [\"Pascal\", \"C\", \"C++\", \"Python\", \"Java\"],\n",
    "    \"creator\": [\n",
    "        \"Niklaus Wirth\",\n",
    "        \"Dennis Ritchie\",\n",
    "        \"Bjarne Stroustrup\",\n",
    "        \"Guido van Rossum\",\n",
    "        \"James Gosling\",\n",
    "    ],\n",
    "    \"year\": [1970, 1973, 1985, 1991, 1995],\n",
    "}\n",
    "\n",
    "pl.LazyFrame(programming_languages).collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "168d4db9-87c0-4ce2-aaab-35f128a03788",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (5, 4)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>language_id</th><th>language</th><th>creator</th><th>year</th></tr><tr><td>i32</td><td>str</td><td>str</td><td>i32</td></tr></thead><tbody><tr><td>0</td><td>&quot;Pascal&quot;</td><td>&quot;Niklaus Wirth&quot;</td><td>1970</td></tr><tr><td>1</td><td>&quot;C&quot;</td><td>&quot;Dennis Ritchie&quot;</td><td>1973</td></tr><tr><td>2</td><td>&quot;C++&quot;</td><td>&quot;Bjarne Stroustrup&quot;</td><td>1985</td></tr><tr><td>3</td><td>&quot;Python&quot;</td><td>&quot;Guido van Rossum&quot;</td><td>1991</td></tr><tr><td>4</td><td>&quot;Java&quot;</td><td>&quot;James Gosling&quot;</td><td>1995</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (5, 4)\n",
       "┌─────────────┬──────────┬───────────────────┬──────┐\n",
       "│ language_id ┆ language ┆ creator           ┆ year │\n",
       "│ ---         ┆ ---      ┆ ---               ┆ ---  │\n",
       "│ i32         ┆ str      ┆ str               ┆ i32  │\n",
       "╞═════════════╪══════════╪═══════════════════╪══════╡\n",
       "│ 0           ┆ Pascal   ┆ Niklaus Wirth     ┆ 1970 │\n",
       "│ 1           ┆ C        ┆ Dennis Ritchie    ┆ 1973 │\n",
       "│ 2           ┆ C++      ┆ Bjarne Stroustrup ┆ 1985 │\n",
       "│ 3           ┆ Python   ┆ Guido van Rossum  ┆ 1991 │\n",
       "│ 4           ┆ Java     ┆ James Gosling     ┆ 1995 │\n",
       "└─────────────┴──────────┴───────────────────┴──────┘"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "programming_languages = {\n",
    "    \"language_id\": range(5),\n",
    "    \"language\": [\"Pascal\", \"C\", \"C++\", \"Python\", \"Java\"],\n",
    "    \"creator\": [\n",
    "        \"Niklaus Wirth\",\n",
    "        \"Dennis Ritchie\",\n",
    "        \"Bjarne Stroustrup\",\n",
    "        \"Guido van Rossum\",\n",
    "        \"James Gosling\",\n",
    "    ],\n",
    "    \"year\": [1970, 1973, 1985, 1991, 1995],\n",
    "}\n",
    "\n",
    "d_types = {\n",
    "    \"language_id\": pl.Int32,\n",
    "    \"language\": pl.String,\n",
    "    \"creator\": pl.String,\n",
    "    \"year\": pl.Int32,\n",
    "}\n",
    "\n",
    "languages = pl.LazyFrame(programming_languages, schema=d_types)\n",
    "languages.collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "8ee12257-f59c-4166-ab6e-3802a5a4d9b9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Schema([('language_id', Int32),\n",
       "        ('language', String),\n",
       "        ('creator', String),\n",
       "        ('year', Int32)])"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "languages.collect_schema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66b4ea00-1afd-4220-92b8-2a99f81b5c30",
   "metadata": {},
   "outputs": [],
   "source": [
    "languages_df = pl.read_csv(\"programming_languages.csv\")\n",
    "type(languages_df)\n",
    "\n",
    "\n",
    "languages_lf = languages_df.lazy()\n",
    "type(languages_lf)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bb75309e-777b-48ec-b340-f9233586741c",
   "metadata": {},
   "source": [
    "# How to Monitor LazyFrame Efficiency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "06aa3446-86ca-43f7-b88c-1ff8d53b0597",
   "metadata": {},
   "outputs": [],
   "source": [
    "# dataframe_timer.py\n",
    "\n",
    "import time\n",
    "\n",
    "import polars as pl\n",
    "\n",
    "start = time.perf_counter()\n",
    "\n",
    "for _ in range(10):\n",
    "    rides = pl.read_parquet(\"rides.parquet\")\n",
    "    result = (\n",
    "        rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
    "        .group_by(pl.col(\"pick_up\"))\n",
    "        .agg(pl.col(\"fare\").mean())\n",
    "        .filter(\n",
    "            pl.col(\"pick_up\").is_in(\n",
    "                [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n",
    "            )\n",
    "        )\n",
    "    )\n",
    "\n",
    "end = time.perf_counter()\n",
    "\n",
    "f\"Code finished in {(end - start) / 10:0.4f} seconds.\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62c57ae3-7eae-4f61-948a-50e31df28eb9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# lazyframe_timer.py\n",
    "\n",
    "import time\n",
    "\n",
    "import polars as pl\n",
    "\n",
    "start = time.perf_counter()\n",
    "\n",
    "for _ in range(10):\n",
    "    rides = pl.scan_parquet(\"rides.parquet\")\n",
    "    result = (\n",
    "        rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
    "        .group_by(pl.col(\"pick_up\"))\n",
    "        .agg(pl.col(\"fare\").mean())\n",
    "        .filter(\n",
    "            pl.col(\"pick_up\").is_in(\n",
    "                [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n",
    "            )\n",
    "        )\n",
    "    ).collect()\n",
    "\n",
    "end = time.perf_counter()\n",
    "\n",
    "f\"Code finished in {(end - start) / 10:0.4f} seconds.\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "723df3df-44e0-40d6-a3d6-22797c37b994",
   "metadata": {},
   "source": [
    "# How LazyFrames Achieve Efficiency\n",
    "## Investigating the Sub-Optimized Query Plan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "48a7ae3f-d906-4c96-b8c1-7bf200a97c64",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "FILTER col(\"pick_up\").is_in([Series]) FROM\n",
      "  AGGREGATE\n",
      "  \t[col(\"fare\").mean()] BY [col(\"pick_up\")] FROM\n",
      "    FILTER [(col(\"pick_up\")) == (col(\"drop_off\"))] FROM\n",
      "      Parquet SCAN [rides.parquet]\n",
      "      PROJECT */5 COLUMNS\n"
     ]
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "rides = pl.scan_parquet(\"rides.parquet\")\n",
    "\n",
    "print(\n",
    "    (\n",
    "        rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
    "        .group_by(pl.col(\"pick_up\"))\n",
    "        .agg(pl.col(\"fare\").mean())\n",
    "        .filter(\n",
    "            pl.col(\"pick_up\").is_in(\n",
    "                [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n",
    "            )\n",
    "        )\n",
    "    ).explain(optimized=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "84df6a05-d10d-48bd-9216-d65a143c21dc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" width=\"312pt\" height=\"287pt\" viewBox=\"0.00 0.00 312.00 286.50\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 282.5)\">\n",
       "<title>polars_query</title>\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-282.5 308,-282.5 308,4 -4,4\"/>\n",
       "<!-- p1 -->\n",
       "<g id=\"node1\" class=\"node\">\n",
       "<title>p1</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"276.62,-278.5 27.38,-278.5 27.38,-242.5 276.62,-242.5 276.62,-278.5\"/>\n",
       "<text text-anchor=\"middle\" x=\"152\" y=\"-255.45\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">FILTER BY col(\"pick_up\").is_in([Series])</text>\n",
       "</g>\n",
       "<!-- p2 -->\n",
       "<g id=\"node2\" class=\"node\">\n",
       "<title>p2</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"231.62,-206.5 72.38,-206.5 72.38,-149 231.62,-149 231.62,-206.5\"/>\n",
       "<text text-anchor=\"middle\" x=\"152\" y=\"-189.2\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">AGG [col(\"fare\").mean()]</text>\n",
       "<text text-anchor=\"middle\" x=\"152\" y=\"-172.7\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">BY</text>\n",
       "<text text-anchor=\"middle\" x=\"152\" y=\"-156.2\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">[col(\"pick_up\")]</text>\n",
       "</g>\n",
       "<!-- p1&#45;&#45;p2 -->\n",
       "<g id=\"edge1\" class=\"edge\">\n",
       "<title>p1--p2</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M152,-242.38C152,-232.1 152,-218.77 152,-206.97\"/>\n",
       "</g>\n",
       "<!-- p3 -->\n",
       "<g id=\"node3\" class=\"node\">\n",
       "<title>p3</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"304,-113 0,-113 0,-77 304,-77 304,-113\"/>\n",
       "<text text-anchor=\"middle\" x=\"152\" y=\"-89.95\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">FILTER BY [(col(\"pick_up\")) == (col(\"drop_off\"))]</text>\n",
       "</g>\n",
       "<!-- p2&#45;&#45;p3 -->\n",
       "<g id=\"edge2\" class=\"edge\">\n",
       "<title>p2--p3</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M152,-148.51C152,-136.68 152,-123.33 152,-113.04\"/>\n",
       "</g>\n",
       "<!-- p4 -->\n",
       "<g id=\"node4\" class=\"node\">\n",
       "<title>p4</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"242.12,-41 61.88,-41 61.88,0 242.12,0 242.12,-41\"/>\n",
       "<text text-anchor=\"middle\" x=\"152\" y=\"-23.7\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">Parquet SCAN [rides.parquet]</text>\n",
       "<text text-anchor=\"middle\" x=\"152\" y=\"-7.2\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">π */5;</text>\n",
       "</g>\n",
       "<!-- p3&#45;&#45;p4 -->\n",
       "<g id=\"edge3\" class=\"edge\">\n",
       "<title>p3--p4</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M152,-76.82C152,-66.18 152,-52.49 152,-41.38\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>"
      ],
      "text/plain": [
       "<IPython.core.display.SVG object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "rides = pl.scan_parquet(\"rides.parquet\")\n",
    "\n",
    "(\n",
    "    rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
    "    .group_by(pl.col(\"pick_up\"))\n",
    "    .agg(pl.col(\"fare\").mean())\n",
    "    .filter(\n",
    "        pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"])\n",
    "    )\n",
    ").show_graph(optimized=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "56ca5932-0c2e-45d9-8df7-8ae9da6da548",
   "metadata": {},
   "source": [
    "## Investigating the Optimized Query Plan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "f773dc12-283e-4f6e-9d5b-162662421ab6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" width=\"460pt\" height=\"159pt\" viewBox=\"0.00 0.00 459.75 159.00\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 155)\">\n",
       "<title>polars_query</title>\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-155 455.75,-155 455.75,4 -4,4\"/>\n",
       "<!-- p1 -->\n",
       "<g id=\"node1\" class=\"node\">\n",
       "<title>p1</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"305.5,-151 146.25,-151 146.25,-93.5 305.5,-93.5 305.5,-151\"/>\n",
       "<text text-anchor=\"middle\" x=\"225.88\" y=\"-133.7\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">AGG [col(\"fare\").mean()]</text>\n",
       "<text text-anchor=\"middle\" x=\"225.88\" y=\"-117.2\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">BY</text>\n",
       "<text text-anchor=\"middle\" x=\"225.88\" y=\"-100.7\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">[col(\"pick_up\")]</text>\n",
       "</g>\n",
       "<!-- p2 -->\n",
       "<g id=\"node2\" class=\"node\">\n",
       "<title>p2</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"451.75,-57.5 0,-57.5 0,0 451.75,0 451.75,-57.5\"/>\n",
       "<text text-anchor=\"middle\" x=\"225.88\" y=\"-40.2\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">Parquet SCAN [rides.parquet]</text>\n",
       "<text text-anchor=\"middle\" x=\"225.88\" y=\"-23.7\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">π 3/5;</text>\n",
       "<text text-anchor=\"middle\" x=\"225.88\" y=\"-7.2\" font-family=\"Times New Roman,serif\" font-size=\"14.00\">σ [(col(\"pick_up\").is_in([Series])) &amp; ([(col(\"pick_up\")) == (col(\"drop_off\"))])]</text>\n",
       "</g>\n",
       "<!-- p1&#45;&#45;p2 -->\n",
       "<g id=\"edge1\" class=\"edge\">\n",
       "<title>p1--p2</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M225.88,-93.3C225.88,-82.05 225.88,-69.11 225.88,-57.84\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>"
      ],
      "text/plain": [
       "<IPython.core.display.SVG object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "rides = pl.scan_parquet(\"rides.parquet\")\n",
    "\n",
    "(\n",
    "    rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
    "    .group_by(pl.col(\"pick_up\"))\n",
    "    .agg(pl.col(\"fare\").mean())\n",
    "    .filter(\n",
    "        pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"])\n",
    "    )\n",
    ").show_graph(optimized=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "dd581a09-b243-4141-8fec-1f01eac567f8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Schema([('pick_up', String),\n",
       "        ('drop_off', String),\n",
       "        ('passengers', Int32),\n",
       "        ('distance', Int32),\n",
       "        ('fare', Int32)])"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "rides.collect_schema()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b5baec65-1fd9-48ea-b315-54fd49d8330e",
   "metadata": {},
   "source": [
    "# How LazyFrames Cope With Large Volumes of Data\n",
    "## Using Streaming to Deal With Large Volumes\n",
    "This code may crash your computer. Please ensure you save everything before running it. Also, you should not run this code within a notebook. Notebooks have their own memory configuration that is separate from your computer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "f8be883e-07e7-456c-9abf-4eb59d66220b",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'Code finished in 0.2538 seconds.'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "rides = pl.scan_csv(\"2021_Yellow_Taxi_Trip_Data.csv\")\n",
    "\n",
    "print(\n",
    "    (\n",
    "        rides.group_by(pl.col(\"PULocationID\")).agg(\n",
    "            pl.col(\"total_amount\").sum()\n",
    "        )\n",
    "    ).collect(streaming=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3ca629d3-2abe-4ae9-b81e-be1f99e86a85",
   "metadata": {},
   "source": [
    "## Deciding When Streaming Should Be Used"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "79f96a4c-87fa-4c02-88a5-3f2a3c201a8e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "STREAMING:\n",
      "  AGGREGATE\n",
      "  \t[col(\"drop_off\").mean()] BY [col(\"pick_up\")] FROM\n",
      "    Parquet SCAN [rides.parquet]\n",
      "    PROJECT 2/5 COLUMNS\n",
      "    SELECTION: [(col(\"pick_up\").is_in([Series])) & ([(col(\"pick_up\")) == (col(\"drop_off\"))])]\n"
     ]
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "rides = pl.scan_parquet(\"rides.parquet\")\n",
    "\n",
    "print(\n",
    "    (\n",
    "        rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\"))\n",
    "        .group_by(pl.col(\"pick_up\"))\n",
    "        .agg(pl.col(\"drop_off\").mean())\n",
    "        .filter(\n",
    "            pl.col(\"pick_up\").is_in(\n",
    "                [\"Brooklyn\", \"Bronx\", \"Queens\", \"Manhattan\"]\n",
    "            )\n",
    "        )\n",
    "    ).explain(streaming=True, optimized=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "ec4a5be0-48a0-4657-b844-a2b2a53f2e27",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " WITH_COLUMNS:\n",
      " [col(\"fare\").mean().over([col(\"pick_up\")]).alias(\"mean fare\")] \n",
      "  STREAMING:\n",
      "    Parquet SCAN [rides.parquet]\n",
      "    PROJECT */5 COLUMNS\n",
      "    SELECTION: [(col(\"pick_up\")) == (col(\"drop_off\"))]\n"
     ]
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "rides = pl.scan_parquet(\"rides.parquet\")\n",
    "\n",
    "print(\n",
    "    (\n",
    "        rides.filter(pl.col(\"pick_up\") == pl.col(\"drop_off\")).with_columns(\n",
    "            pl.col(\"fare\").mean().over(pl.col(\"pick_up\")).alias(\"mean fare\")\n",
    "        )\n",
    "    ).explain(streaming=True, optimized=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f3a9ef30-e20a-42d7-9bb6-2e5b225f40b8",
   "metadata": {},
   "source": [
    "# How to Decide if LazyFrames Are Indeed Suitable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "ca70f902-3aba-4459-ad9b-4560f1434d8f",
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "'LazyFrame' object has no attribute 'pivot'",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[13], line 5\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpolars\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mas\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;21;01mpl\u001b[39;00m\n\u001b[0;32m      3\u001b[0m rides \u001b[38;5;241m=\u001b[39m pl\u001b[38;5;241m.\u001b[39mscan_parquet(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrides.parquet\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m----> 5\u001b[0m \u001b[43mrides\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpivot\u001b[49m(\n\u001b[0;32m      6\u001b[0m     on\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpick_up\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      7\u001b[0m     index\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdrop_off\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      8\u001b[0m     values\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpassengers\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m      9\u001b[0m     aggregate_function\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msum\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m     10\u001b[0m )\n",
      "\u001b[1;31mAttributeError\u001b[0m: 'LazyFrame' object has no attribute 'pivot'"
     ]
    }
   ],
   "source": [
    "import polars as pl\n",
    "\n",
    "rides = pl.scan_parquet(\"rides.parquet\")\n",
    "\n",
    "rides.pivot(\n",
    "    on=\"pick_up\",\n",
    "    index=\"drop_off\",\n",
    "    values=\"passengers\",\n",
    "    aggregate_function=\"sum\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "40bbe881-9b18-4fca-a8aa-f65e6167f677",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (8, 9)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>drop_off</th><th>Manhattan</th><th>Queens</th><th>Unknown</th><th>Brooklyn</th><th>N/A</th><th>Bronx</th><th>EWR</th><th>Staten Island</th></tr><tr><td>str</td><td>i32</td><td>i32</td><td>i32</td><td>i32</td><td>i32</td><td>i32</td><td>i32</td><td>i32</td></tr></thead><tbody><tr><td>&quot;Manhattan&quot;</td><td>3041648</td><td>256376</td><td>4686</td><td>9736</td><td>113</td><td>3749</td><td>14</td><td>11</td></tr><tr><td>&quot;Queens&quot;</td><td>111178</td><td>115640</td><td>629</td><td>3442</td><td>148</td><td>830</td><td>1</td><td>3</td></tr><tr><td>&quot;Brooklyn&quot;</td><td>68712</td><td>66116</td><td>333</td><td>18854</td><td>20</td><td>924</td><td>1</td><td>38</td></tr><tr><td>&quot;Bronx&quot;</td><td>12006</td><td>10396</td><td>59</td><td>862</td><td>23</td><td>1785</td><td>null</td><td>null</td></tr><tr><td>&quot;Unknown&quot;</td><td>9668</td><td>1016</td><td>7512</td><td>50</td><td>37</td><td>13</td><td>10</td><td>null</td></tr><tr><td>&quot;N/A&quot;</td><td>6312</td><td>17630</td><td>66</td><td>146</td><td>2974</td><td>82</td><td>25</td><td>5</td></tr><tr><td>&quot;Staten Island&quot;</td><td>346</td><td>718</td><td>2</td><td>90</td><td>null</td><td>null</td><td>null</td><td>45</td></tr><tr><td>&quot;EWR&quot;</td><td>12759</td><td>806</td><td>27</td><td>47</td><td>131</td><td>6</td><td>1034</td><td>3</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (8, 9)\n",
       "┌───────────────┬───────────┬────────┬─────────┬───┬──────┬───────┬──────┬───────────────┐\n",
       "│ drop_off      ┆ Manhattan ┆ Queens ┆ Unknown ┆ … ┆ N/A  ┆ Bronx ┆ EWR  ┆ Staten Island │\n",
       "│ ---           ┆ ---       ┆ ---    ┆ ---     ┆   ┆ ---  ┆ ---   ┆ ---  ┆ ---           │\n",
       "│ str           ┆ i32       ┆ i32    ┆ i32     ┆   ┆ i32  ┆ i32   ┆ i32  ┆ i32           │\n",
       "╞═══════════════╪═══════════╪════════╪═════════╪═══╪══════╪═══════╪══════╪═══════════════╡\n",
       "│ Manhattan     ┆ 3041648   ┆ 256376 ┆ 4686    ┆ … ┆ 113  ┆ 3749  ┆ 14   ┆ 11            │\n",
       "│ Queens        ┆ 111178    ┆ 115640 ┆ 629     ┆ … ┆ 148  ┆ 830   ┆ 1    ┆ 3             │\n",
       "│ Brooklyn      ┆ 68712     ┆ 66116  ┆ 333     ┆ … ┆ 20   ┆ 924   ┆ 1    ┆ 38            │\n",
       "│ Bronx         ┆ 12006     ┆ 10396  ┆ 59      ┆ … ┆ 23   ┆ 1785  ┆ null ┆ null          │\n",
       "│ Unknown       ┆ 9668      ┆ 1016   ┆ 7512    ┆ … ┆ 37   ┆ 13    ┆ 10   ┆ null          │\n",
       "│ N/A           ┆ 6312      ┆ 17630  ┆ 66      ┆ … ┆ 2974 ┆ 82    ┆ 25   ┆ 5             │\n",
       "│ Staten Island ┆ 346       ┆ 718    ┆ 2       ┆ … ┆ null ┆ null  ┆ null ┆ 45            │\n",
       "│ EWR           ┆ 12759     ┆ 806    ┆ 27      ┆ … ┆ 131  ┆ 6     ┆ 1034 ┆ 3             │\n",
       "└───────────────┴───────────┴────────┴─────────┴───┴──────┴───────┴──────┴───────────────┘"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(\n",
    "    rides.collect().pivot(\n",
    "        on=\"pick_up\",\n",
    "        index=\"drop_off\",\n",
    "        values=\"passengers\",\n",
    "        aggregate_function=\"sum\",\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "d8797e95-1071-4b71-84a6-3c0fe1063cc5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (3, 4)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>drop_off</th><th>Queens</th><th>Brooklyn</th><th>Bronx</th></tr><tr><td>str</td><td>i32</td><td>i32</td><td>i32</td></tr></thead><tbody><tr><td>&quot;Brooklyn&quot;</td><td>66116</td><td>18854</td><td>924</td></tr><tr><td>&quot;Queens&quot;</td><td>115640</td><td>3442</td><td>830</td></tr><tr><td>&quot;Bronx&quot;</td><td>10396</td><td>862</td><td>1785</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (3, 4)\n",
       "┌──────────┬────────┬──────────┬───────┐\n",
       "│ drop_off ┆ Queens ┆ Brooklyn ┆ Bronx │\n",
       "│ ---      ┆ ---    ┆ ---      ┆ ---   │\n",
       "│ str      ┆ i32    ┆ i32      ┆ i32   │\n",
       "╞══════════╪════════╪══════════╪═══════╡\n",
       "│ Brooklyn ┆ 66116  ┆ 18854    ┆ 924   │\n",
       "│ Queens   ┆ 115640 ┆ 3442     ┆ 830   │\n",
       "│ Bronx    ┆ 10396  ┆ 862      ┆ 1785  │\n",
       "└──────────┴────────┴──────────┴───────┘"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(\n",
    "    rides.filter(pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n",
    "    .filter(pl.col(\"drop_off\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n",
    "    .collect()\n",
    "    .pivot(\n",
    "        on=\"pick_up\",\n",
    "        index=\"drop_off\",\n",
    "        values=\"passengers\",\n",
    "        aggregate_function=\"sum\",\n",
    "    )\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "0f17622f-c69d-4b3c-8113-0a6aec912089",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div><style>\n",
       ".dataframe > thead > tr,\n",
       ".dataframe > tbody > tr {\n",
       "  text-align: right;\n",
       "  white-space: pre-wrap;\n",
       "}\n",
       "</style>\n",
       "<small>shape: (1, 4)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>drop_off</th><th>Queens</th><th>Brooklyn</th><th>Bronx</th></tr><tr><td>str</td><td>i32</td><td>i32</td><td>i32</td></tr></thead><tbody><tr><td>&quot;Queens&quot;</td><td>115640</td><td>18854</td><td>1785</td></tr></tbody></table></div>"
      ],
      "text/plain": [
       "shape: (1, 4)\n",
       "┌──────────┬────────┬──────────┬───────┐\n",
       "│ drop_off ┆ Queens ┆ Brooklyn ┆ Bronx │\n",
       "│ ---      ┆ ---    ┆ ---      ┆ ---   │\n",
       "│ str      ┆ i32    ┆ i32      ┆ i32   │\n",
       "╞══════════╪════════╪══════════╪═══════╡\n",
       "│ Queens   ┆ 115640 ┆ 18854    ┆ 1785  │\n",
       "└──────────┴────────┴──────────┴───────┘"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(\n",
    "    rides.filter(pl.col(\"pick_up\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n",
    "    .filter(pl.col(\"drop_off\").is_in([\"Brooklyn\", \"Bronx\", \"Queens\"]))\n",
    "    .collect()\n",
    "    .pivot(\n",
    "        on=\"pick_up\",\n",
    "        index=\"drop_off\",\n",
    "        values=\"passengers\",\n",
    "        aggregate_function=\"sum\",\n",
    "    )\n",
    "    .lazy()\n",
    "    .select(pl.max(\"*\"))\n",
    ").collect()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "eb0bdf0d-7547-40dd-8ad5-9410ec34b9ed",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['corr', 'drop_in_place', 'equals', 'estimated_size', 'extend', 'flags', 'fold', 'get_column', 'get_column_index', 'get_columns', 'glimpse', 'hash_rows', 'height', 'hstack', 'insert_column', 'is_duplicated', 'is_empty', 'is_unique', 'item', 'iter_columns', 'iter_rows', 'iter_slices', 'map_rows', 'max_horizontal', 'mean_horizontal', 'min_horizontal', 'n_chunks', 'n_unique', 'partition_by', 'pivot', 'plot', 'product', 'rechunk', 'replace_column', 'row', 'rows', 'rows_by_key', 'sample', 'shape', 'shrink_to_fit', 'style', 'sum_horizontal', 'to_arrow', 'to_dict', 'to_dicts', 'to_dummies', 'to_init_repr', 'to_jax', 'to_numpy', 'to_pandas', 'to_series', 'to_struct', 'to_torch', 'transpose', 'unstack', 'upsample', 'vstack', 'write_avro', 'write_clipboard', 'write_csv', 'write_database', 'write_delta', 'write_excel', 'write_ipc', 'write_ipc_stream', 'write_json', 'write_ndjson', 'write_parquet']\n"
     ]
    }
   ],
   "source": [
    "df_ops = set(x for x in dir(pl.DataFrame()) if not x.startswith(\"_\"))\n",
    "lazy_ops = set(x for x in dir(pl.LazyFrame()) if not x.startswith(\"_\"))\n",
    "print(sorted(df_ops - lazy_ops))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "98f141b2-3ed1-4dd0-91db-0573896153dc",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
